import datetime as dt
import requests
from bs4 import BeautifulSoup
import re
from tqdm import tqdm

# Keywords used to filter epidemic-related headlines
key_words = ["疫情", "非典", "病例", "sars", "SARS", "肺炎"]

# Build the date pool and the matching archive-page URL pool

# Short range for testing
# start_date = dt.date(2003, 5, 11)
# end_date = dt.date(2003, 5, 13)

# Full range: Nov 2002 through Aug 2003 (end date exclusive)
start_date = dt.date(2002, 11, 1)
end_date = dt.date(2003, 9, 1)

delta = dt.timedelta(days=1)
num_days = (end_date - start_date).days
# One "YYYYMMDD" string per day in [start_date, end_date)
dates = [(start_date + delta * offset).strftime("%Y%m%d") for offset in range(num_days)]
spider_urls = [f'https://news.sina.com.cn/head/news{day}am.shtml' for day in dates]
# Kept for parity with the original loop's final state
now = end_date

# Map each archive URL to its date string
dates_and_urls = dict(zip(spider_urls, dates))


# Crawl one day's news front page
def crawl(url):
    """Fetch the Sina archive page at *url* and extract epidemic-related links.

    Returns a dict mapping headline text to article URL for every <a> tag
    whose title contains at least one keyword from ``key_words``.
    """
    # Fetch and decode; Sina pages of that era are GB-encoded.
    # Timeout added so a stalled server cannot hang the whole crawl.
    response = requests.get(url, timeout=30)
    home_page = response.content.decode('GB18030')

    soup = BeautifulSoup(home_page, 'lxml')
    a_tags = soup.find_all('a')

    # Compile once and run each pattern once per tag (the original
    # re-ran the same findall up to four times per tag).
    r_title = re.compile(r'>(.*?)</a>')
    r_url = re.compile(r'<a href="(.*?)" ')

    titles = []
    news_urls = []
    for a_tag in a_tags:
        tag_html = str(a_tag)
        title_hits = r_title.findall(tag_html)
        url_hits = r_url.findall(tag_html)
        if title_hits and url_hits:
            title = title_hits[0]
            if any(key_word in title for key_word in key_words):
                titles.append(title)
                news_urls.append(url_hits[0])

    # Strip a leading <font ...> wrapper from extracted titles
    # (startswith is index-safe, unlike titles[i][0]/[1]).
    for i in range(len(titles)):
        if titles[i].startswith("<f"):
            titles[i] = re.findall(r'>(.*?)</font>', titles[i])[0]

    # Drop titles that are really <img> tags (picture links) plus their URLs
    temp_dic = dict(zip(titles, news_urls))
    for title in titles[:]:
        if title.startswith("<i"):
            titles.remove(title)
            temp_dic.pop(title, None)

    return temp_dic


# Crawl the body text of a single news article
def crawl_text(url):
    """Fetch the article page at *url* and return its paragraph texts.

    Returns a list of paragraph strings with inline HTML tags removed.
    """
    # Timeout added so one slow article cannot hang the crawl;
    # 'ignore' because some pages contain bytes invalid in GB18030.
    response = requests.get(url, timeout=30)
    home_page = response.content.decode('GB18030', 'ignore')

    soup = BeautifulSoup(home_page, 'lxml')
    p_tags = soup.find_all('p')

    # Compiled once; the original ran the same findall three times per tag.
    r_text = re.compile(r'<p>(.*?)</p>')
    r_tag = re.compile(r'<.*?>')

    paragraphs = []
    for p in p_tags:
        hits = r_text.findall(str(p))
        if not hits:
            continue
        text = hits[0]
        # Heuristic kept from the original: skip fragments whose third
        # character is '<' (typically markup-only paragraphs).
        if len(text) > 2 and text[2] != '<':
            paragraphs.append(r_tag.sub('', text))

    return paragraphs


# Crawl every day's front page; dic maps date string -> {title: url}
dic = {}
for spider_url in tqdm(spider_urls, "爬取url进度"):
    today = dates_and_urls[spider_url]
    dic[today] = crawl(spider_url)

# Fetch each article's text and write it to articles/<date>_<title>.txt
for today_date, today_news_dic in tqdm(dic.items(), "爬取文本进度"):
    for title, url in today_news_dic.items():
        # Reset per article: the original only assigned text_list inside
        # the try, so a failed fetch silently re-wrote the PREVIOUS
        # article's paragraphs under the new title.
        text_list = []
        try:
            text_list = crawl_text(url)
        except Exception:  # narrowed from bare except (kept KeyboardInterrupt working)
            print(url, '爬取失败')

        if len(text_list) > 0:
            try:
                with open(f'articles/{today_date}_{title}.txt', 'w', encoding='utf8') as file:
                    file.write(f'文章来源：{url}\n')
                    for text in text_list:
                        file.write(text)
                        file.write('\n')
            except OSError:
                # Title contains characters invalid in a filename
                print(title, '不能作为文件标题')